{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/kain/Workstation/PyEnv/lib/python3.5/site-packages/sklearn/externals/joblib/_multiprocessing_helpers.py:28: UserWarning: [Errno 13] Permission denied.  joblib will operate in serial mode\n",
      "  warnings.warn('%s.  joblib will operate in serial mode' % (e,))\n",
      "/home/kain/Workstation/PyEnv/lib/python3.5/site-packages/joblib/_multiprocessing_helpers.py:28: UserWarning: [Errno 13] Permission denied.  joblib will operate in serial mode\n",
      "  warnings.warn('%s.  joblib will operate in serial mode' % (e,))\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import gc\n",
    "import time\n",
    "import category_encoders as ce\n",
    "from contextlib import contextmanager\n",
    "import lightgbm as lgb\n",
    "from sklearn.feature_selection import VarianceThreshold\n",
    "from sklearn.metrics import roc_auc_score, roc_curve\n",
    "from sklearn.model_selection import KFold, StratifiedKFold\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.feature_selection import SelectKBest\n",
    "from sklearn.feature_selection import chi2\n",
    "from scipy.cluster.vq import kmeans2, whiten\n",
    "from sklearn.preprocessing import Imputer\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "import category_encoders as ce\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.neighbors import KNeighborsRegressor\n",
    "from catboost import CatBoostClassifier\n",
    "from sklearn import preprocessing\n",
    "import warnings\n",
    "warnings.simplefilter(action='ignore', category=FutureWarning)\n",
    "\n",
    "num_rows = None\n",
    "EPS = 1e-100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-hot encoding for categorical columns with get_dummies\n",
    "def one_hot_encoder(df, nan_as_category = True):\n",
    "    original_columns = list(df.columns)\n",
    "    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']\n",
    "    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)\n",
    "    new_columns = [c for c in df.columns if c not in original_columns]\n",
    "    return df, new_columns\n",
    "\n",
    "# Preprocess application_train.csv and application_test.csv\n",
    "def application_train_test(num_rows = None, nan_as_category = False):\n",
    "    # Read data and merge\n",
    "    df = pd.read_csv('../data/application_train.csv', nrows= num_rows)\n",
    "    test_df = pd.read_csv('../data/application_test.csv', nrows= num_rows)\n",
    "    print(\"Train samples: {}, test samples: {}\".format(len(df), len(test_df)))\n",
    "    df = df.append(test_df).reset_index()\n",
    "    # Optional: Remove 4 applications with XNA CODE_GENDER (train set)\n",
    "   \n",
    "    \n",
    "    docs = [_f for _f in df.columns if 'FLAG_DOC' in _f]\n",
    "    live = [_f for _f in df.columns if ('FLAG_' in _f) & ('FLAG_DOC' not in _f) & ('_FLAG_' not in _f)]\n",
    "    \n",
    "    # NaN values for DAYS_EMPLOYED: 365.243 -> nan\n",
    "    df['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)\n",
    "\n",
    "    inc_by_org = df[['AMT_INCOME_TOTAL', 'ORGANIZATION_TYPE']].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL']\n",
    "\n",
    "    df['NEW_CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']\n",
    "    df['NEW_CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']\n",
    "    df['NEW_DOC_IND_KURT'] = df[docs].kurtosis(axis=1)\n",
    "    df['NEW_LIVE_IND_SUM'] = df[live].sum(axis=1)\n",
    "    df['NEW_INC_PER_CHLD'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN'])\n",
    "    df['NEW_INC_BY_ORG'] = df['ORGANIZATION_TYPE'].map(inc_by_org)\n",
    "    df['NEW_EMPLOY_TO_BIRTH_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']\n",
    "    df['NEW_ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / (1 + df['AMT_INCOME_TOTAL'])\n",
    "    df['NEW_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']\n",
    "    df['NEW_EXT_SOURCES_MEAN'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)\n",
    "    df['NEW_SCORES_STD'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)\n",
    "    df['NEW_SCORES_STD'] = df['NEW_SCORES_STD'].fillna(df['NEW_SCORES_STD'].mean())\n",
    "    df['NEW_CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']\n",
    "    df['NEW_CAR_TO_EMPLOY_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']\n",
    "    df['NEW_PHONE_TO_BIRTH_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']\n",
    "    df['NEW_PHONE_TO_BIRTH_RATIO_EMPLOYER'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED']\n",
    "    df['NEW_CREDIT_TO_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']\n",
    "    \n",
    "    # Categorical features with Binary encode (0 or 1; two categories)\n",
    "    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:\n",
    "        df[bin_feature], uniques = pd.factorize(df[bin_feature])\n",
    "    # Categorical features with One-Hot encode\n",
    "    df, cat_cols = one_hot_encoder(df, nan_as_category)\n",
    "    dropcolum=['FLAG_DOCUMENT_2','FLAG_DOCUMENT_4',\n",
    "    'FLAG_DOCUMENT_5','FLAG_DOCUMENT_6','FLAG_DOCUMENT_7',\n",
    "    'FLAG_DOCUMENT_8','FLAG_DOCUMENT_9','FLAG_DOCUMENT_10', \n",
    "    'FLAG_DOCUMENT_11','FLAG_DOCUMENT_12','FLAG_DOCUMENT_13',\n",
    "    'FLAG_DOCUMENT_14','FLAG_DOCUMENT_15','FLAG_DOCUMENT_16',\n",
    "    'FLAG_DOCUMENT_17','FLAG_DOCUMENT_18','FLAG_DOCUMENT_19',\n",
    "    'FLAG_DOCUMENT_20','FLAG_DOCUMENT_21']\n",
    "    df= df.drop(dropcolum,axis=1)\n",
    "    del test_df\n",
    "    gc.collect()\n",
    "    return df\n",
    "\n",
    "# Preprocess bureau.csv and bureau_balance.csv\n",
    "def bureau_and_balance(num_rows = None, nan_as_category = True):\n",
    "    bureau = pd.read_csv('../data/bureau.csv', nrows = num_rows)\n",
    "    bb = pd.read_csv('../data/bureau_balance.csv', nrows = num_rows)\n",
    "    bb, bb_cat = one_hot_encoder(bb, nan_as_category)\n",
    "    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)\n",
    "    \n",
    "    # Bureau balance: Perform aggregations and merge with bureau.csv\n",
    "    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}\n",
    "    for col in bb_cat:\n",
    "        bb_aggregations[col] = ['mean']\n",
    "    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)\n",
    "    bb_agg.columns = pd.Index([e[0] + \"_\" + e[1].upper() for e in bb_agg.columns.tolist()])\n",
    "    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')\n",
    "    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace= True)\n",
    "    del bb, bb_agg\n",
    "    gc.collect()\n",
    "    \n",
    "    # Bureau and bureau_balance numeric features\n",
    "    num_aggregations = {\n",
    "        'DAYS_CREDIT': [ 'mean', 'var'],\n",
    "        'DAYS_CREDIT_ENDDATE': [ 'mean'],\n",
    "        'DAYS_CREDIT_UPDATE': ['mean'],\n",
    "        'CREDIT_DAY_OVERDUE': ['mean'],\n",
    "        'AMT_CREDIT_MAX_OVERDUE': ['mean'],\n",
    "        'AMT_CREDIT_SUM': [ 'mean', 'sum'],\n",
    "        'AMT_CREDIT_SUM_DEBT': [ 'mean', 'sum'],\n",
    "        'AMT_CREDIT_SUM_OVERDUE': ['mean'],\n",
    "        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],\n",
    "        'AMT_ANNUITY': ['max', 'mean'],\n",
    "        'CNT_CREDIT_PROLONG': ['sum'],\n",
    "        'MONTHS_BALANCE_MIN': ['min'],\n",
    "        'MONTHS_BALANCE_MAX': ['max'],\n",
    "        'MONTHS_BALANCE_SIZE': ['mean', 'sum']\n",
    "    }\n",
    "    # Bureau and bureau_balance categorical features\n",
    "    cat_aggregations = {}\n",
    "    for cat in bureau_cat: cat_aggregations[cat] = ['mean']\n",
    "    for cat in bb_cat: cat_aggregations[cat + \"_MEAN\"] = ['mean']\n",
    "    \n",
    "    bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})\n",
    "    bureau_agg.columns = pd.Index(['BURO_' + e[0] + \"_\" + e[1].upper() for e in bureau_agg.columns.tolist()])\n",
    "    # Bureau: Active credits - using only numerical aggregations\n",
    "    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]\n",
    "    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)\n",
    "    active_agg.columns = pd.Index(['ACTIVE_' + e[0] + \"_\" + e[1].upper() for e in active_agg.columns.tolist()])\n",
    "    bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')\n",
    "    del active, active_agg\n",
    "    gc.collect()\n",
    "    # Bureau: Closed credits - using only numerical aggregations\n",
    "    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]\n",
    "    closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)\n",
    "    closed_agg.columns = pd.Index(['CLOSED_' + e[0] + \"_\" + e[1].upper() for e in closed_agg.columns.tolist()])\n",
    "    bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')\n",
    "    del closed, closed_agg, bureau\n",
    "    gc.collect()\n",
    "    return bureau_agg\n",
    "\n",
    "# Preprocess previous_applications.csv\n",
    "def previous_applications(num_rows = None, nan_as_category = True):\n",
    "    prev = pd.read_csv('../data/previous_application.csv', nrows = num_rows)\n",
    "    prev, cat_cols = one_hot_encoder(prev, nan_as_category= True)\n",
    "    # Days 365.243 values -> nan\n",
    "    prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)\n",
    "    prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)\n",
    "    prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)\n",
    "    prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)\n",
    "    prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)\n",
    "    # Add feature: value ask / value received percentage\n",
    "    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']\n",
    "    # Previous applications numeric features\n",
    "    num_aggregations = {\n",
    "        'AMT_ANNUITY': [ 'max', 'mean'],\n",
    "        'AMT_APPLICATION': [ 'max','mean'],\n",
    "        'AMT_CREDIT': [ 'max', 'mean'],\n",
    "        'APP_CREDIT_PERC': [ 'max', 'mean'],\n",
    "        'AMT_DOWN_PAYMENT': [ 'max', 'mean'],\n",
    "        'AMT_GOODS_PRICE': [ 'max', 'mean'],\n",
    "        'HOUR_APPR_PROCESS_START': [ 'max', 'mean'],\n",
    "        'RATE_DOWN_PAYMENT': [ 'max', 'mean'],\n",
    "        'DAYS_DECISION': [ 'max', 'mean'],\n",
    "        'CNT_PAYMENT': ['mean', 'sum'],\n",
    "    }\n",
    "    # Previous applications categorical features\n",
    "    cat_aggregations = {}\n",
    "    for cat in cat_cols:\n",
    "        cat_aggregations[cat] = ['mean']\n",
    "    \n",
    "    prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})\n",
    "    prev_agg.columns = pd.Index(['PREV_' + e[0] + \"_\" + e[1].upper() for e in prev_agg.columns.tolist()])\n",
    "    # Previous Applications: Approved Applications - only numerical features\n",
    "    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]\n",
    "    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)\n",
    "    approved_agg.columns = pd.Index(['APPROVED_' + e[0] + \"_\" + e[1].upper() for e in approved_agg.columns.tolist()])\n",
    "    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')\n",
    "    # Previous Applications: Refused Applications - only numerical features\n",
    "    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]\n",
    "    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)\n",
    "    refused_agg.columns = pd.Index(['REFUSED_' + e[0] + \"_\" + e[1].upper() for e in refused_agg.columns.tolist()])\n",
    "    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')\n",
    "    del refused, refused_agg, approved, approved_agg, prev\n",
    "    gc.collect()\n",
    "    return prev_agg\n",
    "\n",
    "# Preprocess POS_CASH_balance.csv\n",
    "def pos_cash(num_rows = None, nan_as_category = True):\n",
    "    pos = pd.read_csv('../data/POS_CASH_balance.csv', nrows = num_rows)\n",
    "    pos, cat_cols = one_hot_encoder(pos, nan_as_category= True)\n",
    "    # Features\n",
    "    aggregations = {\n",
    "        'MONTHS_BALANCE': ['max', 'mean', 'size'],\n",
    "        'SK_DPD': ['max', 'mean'],\n",
    "        'SK_DPD_DEF': ['max', 'mean']\n",
    "    }\n",
    "    for cat in cat_cols:\n",
    "        aggregations[cat] = ['mean']\n",
    "    \n",
    "    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)\n",
    "    pos_agg.columns = pd.Index(['POS_' + e[0] + \"_\" + e[1].upper() for e in pos_agg.columns.tolist()])\n",
    "    # Count pos cash accounts\n",
    "    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()\n",
    "    del pos\n",
    "    gc.collect()\n",
    "    return pos_agg\n",
    "    \n",
    "# Preprocess installments_payments.csv\n",
    "def installments_payments(num_rows = None, nan_as_category = True):\n",
    "    ins = pd.read_csv('../data/installments_payments.csv', nrows = num_rows)\n",
    "    ins, cat_cols = one_hot_encoder(ins, nan_as_category= True)\n",
    "    # Percentage and difference paid in each installment (amount paid and installment value)\n",
    "    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']\n",
    "    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']\n",
    "    # Days past due and days before due (no negative values)\n",
    "    ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']\n",
    "    ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']\n",
    "    ins['DPD'] = ins['DPD'].apply(lambda x: x if x > 0 else 0)\n",
    "    ins['DBD'] = ins['DBD'].apply(lambda x: x if x > 0 else 0)\n",
    "    # Features: Perform aggregations\n",
    "    aggregations = {\n",
    "        'NUM_INSTALMENT_VERSION': ['nunique'],\n",
    "        'DPD': ['max', 'mean', 'sum'],\n",
    "        'DBD': ['max', 'mean', 'sum'],\n",
    "        'PAYMENT_PERC': [ 'mean',  'var'],\n",
    "        'PAYMENT_DIFF': [ 'mean', 'var'],\n",
    "        'AMT_INSTALMENT': ['max', 'mean', 'sum'],\n",
    "        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],\n",
    "        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']\n",
    "    }\n",
    "    for cat in cat_cols:\n",
    "        aggregations[cat] = ['mean']\n",
    "    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)\n",
    "    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + \"_\" + e[1].upper() for e in ins_agg.columns.tolist()])\n",
    "    # Count installments accounts\n",
    "    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()\n",
    "    del ins\n",
    "    gc.collect()\n",
    "    return ins_agg\n",
    "\n",
    "# Preprocess credit_card_balance.csv\n",
    "def credit_card_balance(num_rows = None, nan_as_category = True):\n",
    "    cc = pd.read_csv('../data/credit_card_balance.csv', nrows = num_rows)\n",
    "    cc, cat_cols = one_hot_encoder(cc, nan_as_category= True)\n",
    "    # General aggregations\n",
    "    cc.drop(['SK_ID_PREV'], axis= 1, inplace = True)\n",
    "    cc_agg = cc.groupby('SK_ID_CURR').agg([ 'max', 'mean', 'sum', 'var'])\n",
    "    cc_agg.columns = pd.Index(['CC_' + e[0] + \"_\" + e[1].upper() for e in cc_agg.columns.tolist()])\n",
    "    # Count credit card lines\n",
    "    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()\n",
    "    del cc\n",
    "    gc.collect()\n",
    "    return cc_agg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train samples: 307511, test samples: 48744\n"
     ]
    }
   ],
   "source": [
    "df = application_train_test(num_rows=num_rows, nan_as_category=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(356255, 240)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "bureau = bureau_and_balance(num_rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "21"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = df.join(bureau, how='left', on='SK_ID_CURR')\n",
    "del bureau\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Previous applications df shape: (338857, 219)\n",
      "Pos-cash balance df shape: (337252, 18)\n",
      "Installments payments df shape: (339587, 22)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "7"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "prev = previous_applications(num_rows)\n",
    "print(\"Previous applications df shape:\", prev.shape)\n",
    "df = df.join(prev, how='left', on='SK_ID_CURR')\n",
    "del prev\n",
    "gc.collect()\n",
    "\n",
    "pos = pos_cash(num_rows)\n",
    "print(\"Pos-cash balance df shape:\", pos.shape)\n",
    "df = df.join(pos, how='left', on='SK_ID_CURR')\n",
    "del pos\n",
    "gc.collect()\n",
    "\n",
    "ins = installments_payments(num_rows)\n",
    "print(\"Installments payments df shape:\", ins.shape)\n",
    "df = df.join(ins, how='left', on='SK_ID_CURR')\n",
    "del ins\n",
    "gc.collect()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Credit card balance df shape: (103558, 113)\n",
      "(356255, 707)\n"
     ]
    }
   ],
   "source": [
    "cc = credit_card_balance(num_rows)\n",
    "print(\"Credit card balance df shape:\", cc.shape)\n",
    "df = df.join(cc, how='left', on='SK_ID_CURR')\n",
    "del cc\n",
    "gc.collect()\n",
    "\n",
    "print(df.shape)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(356255, 707)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_file_path = \"Level_1_stack/test_knnRegrssor_8.csv\"\n",
    "validation_file_path = 'Level_1_stack/validation_knnRegrssor_8.csv'\n",
    "num_folds = 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv('/media/limbo/Home-Credit/data/application_train.csv.zip', nrows= num_rows)\n",
    "n_train = train.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(356255, 707)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def safe_int(x):\n",
    "    try:\n",
    "        res = int(x)\n",
    "    except:\n",
    "        res = 0\n",
    "    return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "index                                           int64\n",
       "AMT_ANNUITY                                   float64\n",
       "AMT_CREDIT                                    float64\n",
       "AMT_GOODS_PRICE                               float64\n",
       "AMT_INCOME_TOTAL                              float64\n",
       "AMT_REQ_CREDIT_BUREAU_DAY                     float64\n",
       "AMT_REQ_CREDIT_BUREAU_HOUR                    float64\n",
       "AMT_REQ_CREDIT_BUREAU_MON                     float64\n",
       "AMT_REQ_CREDIT_BUREAU_QRT                     float64\n",
       "AMT_REQ_CREDIT_BUREAU_WEEK                    float64\n",
       "AMT_REQ_CREDIT_BUREAU_YEAR                    float64\n",
       "APARTMENTS_AVG                                float64\n",
       "APARTMENTS_MEDI                               float64\n",
       "APARTMENTS_MODE                               float64\n",
       "BASEMENTAREA_AVG                              float64\n",
       "BASEMENTAREA_MEDI                             float64\n",
       "BASEMENTAREA_MODE                             float64\n",
       "CNT_CHILDREN                                    int64\n",
       "CNT_FAM_MEMBERS                               float64\n",
       "CODE_GENDER                                     int64\n",
       "COMMONAREA_AVG                                float64\n",
       "COMMONAREA_MEDI                               float64\n",
       "COMMONAREA_MODE                               float64\n",
       "DAYS_BIRTH                                      int64\n",
       "DAYS_EMPLOYED                                 float64\n",
       "DAYS_ID_PUBLISH                                 int64\n",
       "DAYS_LAST_PHONE_CHANGE                        float64\n",
       "DAYS_REGISTRATION                             float64\n",
       "DEF_30_CNT_SOCIAL_CIRCLE                      float64\n",
       "DEF_60_CNT_SOCIAL_CIRCLE                      float64\n",
       "                                               ...   \n",
       "CC_NAME_CONTRACT_STATUS_Active_VAR            float64\n",
       "CC_NAME_CONTRACT_STATUS_Approved_MAX          float64\n",
       "CC_NAME_CONTRACT_STATUS_Approved_MEAN         float64\n",
       "CC_NAME_CONTRACT_STATUS_Approved_SUM          float64\n",
       "CC_NAME_CONTRACT_STATUS_Approved_VAR          float64\n",
       "CC_NAME_CONTRACT_STATUS_Completed_MAX         float64\n",
       "CC_NAME_CONTRACT_STATUS_Completed_MEAN        float64\n",
       "CC_NAME_CONTRACT_STATUS_Completed_SUM         float64\n",
       "CC_NAME_CONTRACT_STATUS_Completed_VAR         float64\n",
       "CC_NAME_CONTRACT_STATUS_Demand_MAX            float64\n",
       "CC_NAME_CONTRACT_STATUS_Demand_MEAN           float64\n",
       "CC_NAME_CONTRACT_STATUS_Demand_SUM            float64\n",
       "CC_NAME_CONTRACT_STATUS_Demand_VAR            float64\n",
       "CC_NAME_CONTRACT_STATUS_Refused_MAX           float64\n",
       "CC_NAME_CONTRACT_STATUS_Refused_MEAN          float64\n",
       "CC_NAME_CONTRACT_STATUS_Refused_SUM           float64\n",
       "CC_NAME_CONTRACT_STATUS_Refused_VAR           float64\n",
       "CC_NAME_CONTRACT_STATUS_Sent proposal_MAX     float64\n",
       "CC_NAME_CONTRACT_STATUS_Sent proposal_MEAN    float64\n",
       "CC_NAME_CONTRACT_STATUS_Sent proposal_SUM     float64\n",
       "CC_NAME_CONTRACT_STATUS_Sent proposal_VAR     float64\n",
       "CC_NAME_CONTRACT_STATUS_Signed_MAX            float64\n",
       "CC_NAME_CONTRACT_STATUS_Signed_MEAN           float64\n",
       "CC_NAME_CONTRACT_STATUS_Signed_SUM            float64\n",
       "CC_NAME_CONTRACT_STATUS_Signed_VAR            float64\n",
       "CC_NAME_CONTRACT_STATUS_nan_MAX               float64\n",
       "CC_NAME_CONTRACT_STATUS_nan_MEAN              float64\n",
       "CC_NAME_CONTRACT_STATUS_nan_SUM               float64\n",
       "CC_NAME_CONTRACT_STATUS_nan_VAR               float64\n",
       "CC_COUNT                                      float64\n",
       "Length: 707, dtype: object"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting LightGBM. Train shape: (307511, 707), test shape: (48744, 707)\n",
      "(307511, 23)\n",
      "(246008, 23) (61503, 23) (48744, 23)\n",
      "Fold  1 AUC : 0.544661\n",
      "(246009, 23) (61502, 23) (48744, 23)\n",
      "Fold  2 AUC : 0.550613\n",
      "(246009, 23) (61502, 23) (48744, 23)\n",
      "Fold  3 AUC : 0.546722\n",
      "(246009, 23) (61502, 23) (48744, 23)\n",
      "Fold  4 AUC : 0.544014\n",
      "(246009, 23) (61502, 23) (48744, 23)\n",
      "Fold  5 AUC : 0.551783\n",
      "Full AUC score 0.547518\n"
     ]
    }
   ],
   "source": [
    "encoding = 'ohe'\n",
    "\n",
    "train_df = df.iloc[0:n_train]\n",
    "test_df = df.iloc[n_train:]\n",
    "\n",
    "print(\"Starting LightGBM. Train shape: {}, test shape: {}\".format(train_df.shape, test_df.shape))\n",
    "gc.collect()\n",
    "# Cross validation model\n",
    "folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)\n",
    "# Create arrays and dataframes to store results\n",
    "oof_preds = np.zeros(train_df.shape[0])\n",
    "sub_preds = np.zeros(test_df.shape[0])\n",
    "feature_importance_df = pd.DataFrame()\n",
    "\n",
    "feats_0 = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]\n",
    "\n",
    "feats = [col for col in feats_0 if df[col].dtype == 'int64']\n",
    "\n",
    "\n",
    "print(train_df[feats].shape)\n",
    "for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):\n",
    "        \n",
    "      \n",
    "        categorical_columns = [col for col in df.columns if df[col].dtype == 'object']\n",
    "        \n",
    "        if encoding == 'ohe':\n",
    "            \n",
    "            enc = ce.TargetEncoder(impute_missing=True, cols=categorical_columns).fit(train_df[feats].iloc[train_idx],\n",
    "                                                                                       train_df['TARGET'].iloc[train_idx])\n",
    "            x_train = enc.transform(train_df[feats].iloc[train_idx]).replace([-np.inf, np.inf]).fillna(-999)\n",
    "            x_valid = enc.transform(train_df[feats].iloc[valid_idx]).replace([-np.inf, np.inf]).fillna(-999)\n",
    "            x_test = enc.transform(test_df[feats]).replace([-np.inf, np.inf]).fillna(-999)\n",
    "            \n",
    "    \n",
    "            gc.collect()\n",
    "            \n",
    "#             svd = TruncatedSVD(n_components=150, random_state=666, n_iter=5)\n",
    "#             svd.fit(x_train)\n",
    "#             x_train = svd.transform(x_train)\n",
    "#             x_valid = svd.transform(x_valid)\n",
    "#             x_test = svd.transform(x_test)\n",
    "            \n",
    "            scaler = preprocessing.RobustScaler(quantile_range=(3.0, 97.0), with_scaling=True, with_centering=True)\n",
    "            scaler.fit(x_train)\n",
    "            x_train = scaler.transform(x_train)\n",
    "            x_valid = scaler.transform(x_valid)\n",
    "            x_test = scaler.transform(x_test)\n",
    "            \n",
    "            print(x_train.shape, x_valid.shape, x_test.shape)\n",
    "            \n",
    "            \n",
    "        clf = KNeighborsRegressor(n_neighbors=2 ** 3, weights='distance', n_jobs=-1)\n",
    "        \n",
    "        clf.fit(x_train, train_df['TARGET'].iloc[train_idx])\n",
    "        \n",
    "        \n",
    "        oof_preds[valid_idx] = clf.predict(x_valid)\n",
    "        sub_preds += clf.predict(x_test) / folds.n_splits\n",
    "\n",
    "        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(train_df['TARGET'].iloc[valid_idx], oof_preds[valid_idx])))\n",
    "        del clf\n",
    "        gc.collect()\n",
    "\n",
    "print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))\n",
    "             \n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Full AUC score 0.547518\n"
     ]
    }
   ],
   "source": [
    "print('Full AUC score %.6f' % roc_auc_score(list(map(safe_int, train_df['TARGET'])), oof_preds))\n",
    "        \n",
    "sub_df = test_df[['SK_ID_CURR']].copy()\n",
    "sub_df['TARGET'] = sub_preds\n",
    "sub_df[['SK_ID_CURR', 'TARGET']].to_csv(test_file_path, index= False)\n",
    "\n",
    "val_df = train_df[['SK_ID_CURR', 'TARGET']].copy()\n",
    "val_df['TARGET'] = oof_preds\n",
    "val_df[['SK_ID_CURR', 'TARGET']].to_csv(validation_file_path, index= False)        \n",
    "            \n",
    "            \n",
    "   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(48744,)"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sub_preds.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(307511,)"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "oof_preds.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
